*****************************************************************************
**                      Replication do-file for article                    **
** Continuity Trumps? The impact of interviewer change on item nonresponse **
**                  authors: Kristin Hajek & Nina Schumann                 ** 
*****************************************************************************

**********************
** 4) Define Sample **
**********************

* maximum matrix size for the session
set matsize 700

* Load the person-year file from the project directory stored in global $Intwechsel.
* A forward slash is used as path separator: Stata accepts it on Windows, macOS and
* Linux alike, whereas a backslash only works on Windows.
use "$Intwechsel/3_nonres_2.dta", clear
sort id wave

* exclude all records flagged demodiff==1
drop if demodiff==1
count	// 53,447

*****************************************************************************

** a) censor each person's history from the second interviewer change onwards

* total of intchange per person (before censoring)
bysort id: egen sum_change = total(intchange)

* changenr: running count, within person in wave order, of records whose
* intchange is non-zero and either differs from the preceding record or
* repeats a 1; a person's first record is never counted
sort id wave
bysort id (wave): generate changenr = sum( ///
	_n != 1 & intchange != 0 & ///
	(intchange != intchange[_n-1] | (intchange == 1 & intchange[_n-1] == 1)))

* remove every interview (not the whole person) once the counter exceeds one,
* i.e. the interview of the second change and everything after it
drop if changenr > 1

* recompute the per-person total on the censored data
drop sum_change
bysort id: egen sum_change = total(intchange)
tabulate sum_change, missing  // 1 change at max.

count	// 51,518

*****************************************************************************

** b) generate important independent variables (no missings allowed!)

* respondent's sex
* (sex_gen is already harmonised over the waves)
* one indicator per category of sex_gen: male1, male2, male3
tabulate sex_gen, generate(male)
drop if sex_gen == -4  // code -4: change of sex
count	// 51,507
* keep only the indicator of the second category and call it male
drop male1 male3
rename male2 male
tabulate male, missing

* consistency check: does the recorded sex of an interviewer change between waves?
sort id wave
* flag records where the same respondent meets the same interviewer as in the
* preceding record, but the interviewer's recorded sex differs
generate changesex = (id == id[_n-1] & intid == intid[_n-1] & intsex != intsex[_n-1])
tabulate changesex // 382 cases
tabulate intid if changesex == 1 // 10 interviewers change sex between waves
sort intid wave
*l intid id wave intsex if inlist(intid,1147,1216,1228,1230,1257,1301,1339,1357,1378,1410), sepby(intid)
* wave 1 holds some wrong entries while the later waves are coherent
* => overwrite intsex of the affected interviewers with the coherent value
replace intsex = 2 if inlist(intid, 1228, 1357)
replace intsex = 1 if inlist(intid, 1147, 1216, 1230, 1257, 1301, 1339, 1378, 1410)
drop changesex

* sex interviewer --> intmale (1 if intsex==1, 0 if intsex==2, missing otherwise)
gen intmale=0 if intsex==2
replace intmale=1 if intsex==1
tab intmale, m

* sex combination respondent-interviewer
* Each dummy is 1 for its own combination and 0 for the other three.
* If the sex of respondent or interviewer is unknown the dummies are set to
* missing instead of 0, so that such records are caught by the missing-value
* screen in step c) rather than being silently coded 0 on all four dummies.
* NOTE(review): if intmale contains missings, the case counts noted in later
* steps will change - check the output of the tabulation above.
gen m_intf = (male==1 & intmale==0) if !missing(male, intmale)
gen m_intm = (male==1 & intmale==1) if !missing(male, intmale)
gen f_intm = (male==0 & intmale==1) if !missing(male, intmale)
gen f_intf = (male==0 & intmale==0) if !missing(male, intmale)

* age of interviewer and respondent
tabulate intage, missing   // no missings
tabulate age, missing // no missings

* consistency check: implausible jumps of the interviewer's age between waves?
sort id wave
* interviewer age net of the wave number
generate agecheck = intage - wave
* flag records where the same respondent meets the same interviewer as in the
* preceding record, but agecheck moved by more than one in either direction
generate changeage = (id == id[_n-1] & intid == intid[_n-1] & ///
	(agecheck > agecheck[_n-1] + 1 | agecheck < agecheck[_n-1] - 1))
tabulate changeage, missing // 169 cases
tabulate intid if changeage == 1 // 6 interviewers with implausible age differences between waves
sort intid wave
* l intid id wave intage agecheck if inlist(intid,1147,1216,1230,1301,1378,1410), sepby(intid)
* wave 1 holds some wrong entries while the later waves are coherent
* => overwrite the wave-1 age of the affected interviewers
replace intage = 65 if wave == 1 & inlist(intid, 1147, 1216, 1378)
replace intage = 44 if wave == 1 & intid == 1230
replace intage = 61 if wave == 1 & intid == 1301
replace intage = 50 if wave == 1 & intid == 1410
drop changeage agecheck

* absolute age difference between interviewer and respondent
* Defined only when both ages are positive and non-missing; otherwise agediff
* stays missing. The guard is applied symmetrically to both ages, so a
* non-positive code on either variable can never enter the difference.
gen agediff = abs(intage - age) if age>0 & intage>0 & !missing(age, intage)
tab agediff, m
* inspect the records without a valid age difference
l age intage if agediff==.	// okay

* number of interviews, split into two groups per wave:
* intnum_h = 0 if intnum lies below the wave-specific median, 1 otherwise
generate intnum_h = 1
forvalues w = 1/7 {
	summarize intnum if wave == `w', detail
	* nquantiles(2) leaves the median in r(r1)
	_pctile intnum if wave == `w', nquantiles(2)
	return list
	replace intnum_h = 0 if wave == `w' & intnum < r(r1)
}

* variable participation: running number of a person's interviews in wave order
* (used instead of the wave and gap variables)
bysort id (wave): gen participation=_n

* dummy: gap in previous wave
* = 1 if the person's preceding interview lies more than one wave back.
* The comparison runs within person (by id), so wave[_n-1] is missing on a
* person's first record and the dummy is 0 there; without the by-group the
* first record would be compared with the last wave of the preceding person.
bysort id (wave): gen last_nonres = (participation!=wave & wave[_n-1]<wave-1)

* check: number of gaps per person
cap drop help
bysort id: egen help=total(last_nonres)
tab help, m
drop help

* one indicator per wave: wave1, wave2, ...
tabulate wave, generate(wave)

* one indicator per cohort: c1, c2, ...
tabulate cohort, generate(c)

* interviewer encounter dummies
sort id wave
tabulate sum_change, missing // ever interviewer change?

* help = wave number (1-7) on records with an interviewer change, 0 elsewhere;
* the per-person total is stored in wave_change
generate help = 0
replace help = wave if intchange == 1 & inlist(wave, 1, 2, 3, 4, 5, 6, 7)
bysort id: egen wave_change = total(help)

* second_int copies changenr; first_int is changenr with 0 and 1 swapped
generate second_int = changenr
generate first_int = changenr
recode first_int (0 = 1) (1 = 0)

* number of encounters with the first interviewer
* (participation count while first_int==1, 0 otherwise)
generate interv_enc_first = first_int * participation
tabulate interv_enc_first, generate(interv_enc_first) // Dummies
* the first dummy marks the value 0 -> discard it and shift the remaining
* dummies down by one, so that the suffix equals the number of encounters
drop interv_enc_first1 // =0
forvalues new = 1/7 {
	local old = `new' + 1
	rename interv_enc_first`old' interv_enc_first`new'
}

* number of encounters with second interviewer
* 1 on the record with the interviewer change, 0 elsewhere to start with
gen interv_enc_second=0
replace interv_enc_second=1 if intchange==1
sort id wave
* carry the counter forward within person: a second-interviewer record that
* follows a record with value x gets x+1
forvalues x=1/8 {
	local y=`x'+1
	replace interv_enc_second=`y' if interv_enc_second[_n-1]==`x' & second_int==1 & id==id[_n-1]
}
tab interv_enc_second, gen(interv_enc_second) // Dummies
* check: 
* l id wave first_int second_int interv_enc_first interv_enc_second participation help33, sepby(id)
drop help*
* the first dummy marks the value 0 -> discard it and shift the remaining
* dummies down by one, so that the suffix equals the number of encounters
drop interv_enc_second1 // =0
forvalues x=2/7 {
	local y=`x'-1
	rename interv_enc_second`x' interv_enc_second`y'
}
* cross-check the encounter dummies against the participation counter
* (forvalues replaces the out-of-date -for num- command)
forvalues k=1/7 {
	tab interv_enc_first`k' participation, m
}
forvalues k=1/6 {
	tab interv_enc_second`k' participation, m
}
*l id wave intchange first_int second_int interv_enc_second* if inlist(id,3040000,749731000,749197000), sepby(id)

* number of encounters with the current interviewer: taken from the
* first-interviewer counter if first_int==1, from the second-interviewer
* counter if first_int==0
generate interv_enc = cond(first_int == 1, interv_enc_first, interv_enc_second) if inlist(first_int, 0, 1)

tabulate interv_enc, generate(interv_enc)

* encounters with the second interviewer: collapse 4, 5 and 6 into "4+"
tab1 interv_enc_second?, missing
replace interv_enc_second4 = 1 if inlist(1, interv_enc_second5, interv_enc_second6)
tabulate interv_enc_second4, missing
drop interv_enc_second5 interv_enc_second6

* interactions of the first encounter with the second interviewer

* ... with mobility (move_dist, move_close)
foreach k in dist close {
	generate interv_enc_second1_`k' = interv_enc_second1 * move_`k'
}

* ... with a gap in the previous wave
generate interv_enc_second1_gap = interv_enc_second1 * last_nonres

* ... with wave, plus one dummy per resulting value
generate interv_enc_second1_w = interv_enc_second1 * wave
tabulate interv_enc_second1_w, generate(interv_enc_second1_w)


*****************************************************************************

** c) Drop person-years with a missing value on any important variable

* overview of the missing-value patterns
mvpatterns missings* intage agediff m_intm f_intm intnum* move wave 
		// missings only for income variables due to variable construction (missing vs. valid answer)

* remove every record with a missing value on at least one of the listed variables
drop if missing(missingsPC_CAPI_1,missingsPC_CAPI_2,missings_Eink,intage,agediff,m_intm,f_intm,intnum,move,wave)

count	// 50,144

*****************************************************************************

** d) For persons with an interviewer change, keep from the first-interviewer
**    period only the interview directly preceding the change

* person-level flag: any record with the second interviewer?
bysort id: egen maxchange = max(second_int) // ever change?
* last_first = 1 on a first-interviewer record whose next record (in wave
* order) is the first encounter with the second interviewer; missing otherwise
bysort id (wave): generate last_first = 1 if second_int == 0 & second_int[_n+1] == 1 & interv_enc[_n+1] == 1
* among changers, discard all other first-interviewer records
drop if maxchange == 1 & second_int == 0 & last_first != 1 // (4,698 observations deleted)
count	// 45,446

*****************************************************************************

** e) Restrict the sample to persons with at least 2 remaining person-years

* number of person-years per person
bysort id: generate wavecount = _N
tabulate wavecount if wave == 1
* persons with a single person-year are removed
drop if wavecount <= 1

count 	// 42,826

*****************************************************************************

** f) Exclude outlier persons: anyone with a nonresponse value above 10 and below 100
**    (either -1 "don't know" or -2 "no answer") in at least one wave
**    NOTE(review): missingsPC_CAPI_* presumably hold percentages - confirm

* k = 1: don't knows	(602 person years flagged)
* k = 2: refusals		(1,057 person years flagged)
foreach k in 1 2 {
	summarize missingsPC_CAPI_`k', detail
	count if missingsPC_CAPI_`k' > 10 & missingsPC_CAPI_`k' < 100
	* flag the offending person-years ...
	generate outlier`k' = 1 if missingsPC_CAPI_`k' > 10 & missingsPC_CAPI_`k' < 100
	* ... spread the flag over all person-years of the person ...
	bysort id: egen outlier`k'_n = max(outlier`k')
	tabulate outlier`k'_n, missing
	* ... and remove the whole person
	drop if outlier`k'_n == 1
}

drop outlier*

count	// 41,167

*****************************************************************************

** g) Define as panel data

* declare id as panel identifier and wave as time variable
sort id wave
xtset id wave

* participation patterns (up to 20 shown)
xtdes, pattern(20)		

* enough within variation?
xtsum missingsPC_CAPI intchange intage m_intm f_intm move

* Store the analysis sample in the project directory held in global $Intwechsel.
* A forward slash is used as path separator: Stata accepts it on Windows, macOS and
* Linux alike, whereas a backslash only works on Windows.
sort id wave
save  "$Intwechsel/4_sample.dta", replace





